1. Import packages

In [1]:
#General
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

#Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly.express as px

#Feature Engineering
from scipy import stats
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, RobustScaler

#Modeling
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, StratifiedKFold, GridSearchCV, RandomizedSearchCV, KFold, RepeatedKFold
from sklearn.feature_selection import f_classif, VarianceThreshold, SelectKBest, f_regression
from sklearn.feature_selection import RFECV, f_classif, VarianceThreshold, SelectKBest, f_regression
from yellowbrick.model_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score
from numpy import mean

from sklearn.ensemble import AdaBoostClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

2. Data Pre-processing

1) Read Data

In [2]:
# Read the data
df = pd.read_csv('data.csv')
columns = df.columns
print(df.shape)
print("total null values", df.isnull().sum().sum())
print("total potential duplicated rows", df.duplicated().sum())
(6819, 96)
total null values 0
total potential duplicated rows 0
In [3]:
# Data info overall
bankrupt, bankrupt_perc = (df["Bankrupt?"].value_counts(), round(df["Bankrupt?"].value_counts(normalize=True),2))
display(bankrupt, bankrupt_perc)
print("----------------------------------")
df.info()
0    6599
1     220
Name: Bankrupt?, dtype: int64
0    0.97
1    0.03
Name: Bankrupt?, dtype: float64
----------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6819 entries, 0 to 6818
Data columns (total 96 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Bankrupt?                                                 6819 non-null   int64  
 1    ROA(C) before interest and depreciation before interest  6819 non-null   float64
 2    ROA(A) before interest and % after tax                   6819 non-null   float64
 3    ROA(B) before interest and depreciation after tax        6819 non-null   float64
 4    Operating Gross Margin                                   6819 non-null   float64
 5    Realized Sales Gross Margin                              6819 non-null   float64
 6    Operating Profit Rate                                    6819 non-null   float64
 7    Pre-tax net Interest Rate                                6819 non-null   float64
 8    After-tax net Interest Rate                              6819 non-null   float64
 9    Non-industry income and expenditure/revenue              6819 non-null   float64
 10   Continuous interest rate (after tax)                     6819 non-null   float64
 11   Operating Expense Rate                                   6819 non-null   float64
 12   Research and development expense rate                    6819 non-null   float64
 13   Cash flow rate                                           6819 non-null   float64
 14   Interest-bearing debt interest rate                      6819 non-null   float64
 15   Tax rate (A)                                             6819 non-null   float64
 16   Net Value Per Share (B)                                  6819 non-null   float64
 17   Net Value Per Share (A)                                  6819 non-null   float64
 18   Net Value Per Share (C)                                  6819 non-null   float64
 19   Persistent EPS in the Last Four Seasons                  6819 non-null   float64
 20   Cash Flow Per Share                                      6819 non-null   float64
 21   Revenue Per Share (Yuan ¥)                               6819 non-null   float64
 22   Operating Profit Per Share (Yuan ¥)                      6819 non-null   float64
 23   Per Share Net profit before tax (Yuan ¥)                 6819 non-null   float64
 24   Realized Sales Gross Profit Growth Rate                  6819 non-null   float64
 25   Operating Profit Growth Rate                             6819 non-null   float64
 26   After-tax Net Profit Growth Rate                         6819 non-null   float64
 27   Regular Net Profit Growth Rate                           6819 non-null   float64
 28   Continuous Net Profit Growth Rate                        6819 non-null   float64
 29   Total Asset Growth Rate                                  6819 non-null   float64
 30   Net Value Growth Rate                                    6819 non-null   float64
 31   Total Asset Return Growth Rate Ratio                     6819 non-null   float64
 32   Cash Reinvestment %                                      6819 non-null   float64
 33   Current Ratio                                            6819 non-null   float64
 34   Quick Ratio                                              6819 non-null   float64
 35   Interest Expense Ratio                                   6819 non-null   float64
 36   Total debt/Total net worth                               6819 non-null   float64
 37   Debt ratio %                                             6819 non-null   float64
 38   Net worth/Assets                                         6819 non-null   float64
 39   Long-term fund suitability ratio (A)                     6819 non-null   float64
 40   Borrowing dependency                                     6819 non-null   float64
 41   Contingent liabilities/Net worth                         6819 non-null   float64
 42   Operating profit/Paid-in capital                         6819 non-null   float64
 43   Net profit before tax/Paid-in capital                    6819 non-null   float64
 44   Inventory and accounts receivable/Net value              6819 non-null   float64
 45   Total Asset Turnover                                     6819 non-null   float64
 46   Accounts Receivable Turnover                             6819 non-null   float64
 47   Average Collection Days                                  6819 non-null   float64
 48   Inventory Turnover Rate (times)                          6819 non-null   float64
 49   Fixed Assets Turnover Frequency                          6819 non-null   float64
 50   Net Worth Turnover Rate (times)                          6819 non-null   float64
 51   Revenue per person                                       6819 non-null   float64
 52   Operating profit per person                              6819 non-null   float64
 53   Allocation rate per person                               6819 non-null   float64
 54   Working Capital to Total Assets                          6819 non-null   float64
 55   Quick Assets/Total Assets                                6819 non-null   float64
 56   Current Assets/Total Assets                              6819 non-null   float64
 57   Cash/Total Assets                                        6819 non-null   float64
 58   Quick Assets/Current Liability                           6819 non-null   float64
 59   Cash/Current Liability                                   6819 non-null   float64
 60   Current Liability to Assets                              6819 non-null   float64
 61   Operating Funds to Liability                             6819 non-null   float64
 62   Inventory/Working Capital                                6819 non-null   float64
 63   Inventory/Current Liability                              6819 non-null   float64
 64   Current Liabilities/Liability                            6819 non-null   float64
 65   Working Capital/Equity                                   6819 non-null   float64
 66   Current Liabilities/Equity                               6819 non-null   float64
 67   Long-term Liability to Current Assets                    6819 non-null   float64
 68   Retained Earnings to Total Assets                        6819 non-null   float64
 69   Total income/Total expense                               6819 non-null   float64
 70   Total expense/Assets                                     6819 non-null   float64
 71   Current Asset Turnover Rate                              6819 non-null   float64
 72   Quick Asset Turnover Rate                                6819 non-null   float64
 73   Working capitcal Turnover Rate                           6819 non-null   float64
 74   Cash Turnover Rate                                       6819 non-null   float64
 75   Cash Flow to Sales                                       6819 non-null   float64
 76   Fixed Assets to Assets                                   6819 non-null   float64
 77   Current Liability to Liability                           6819 non-null   float64
 78   Current Liability to Equity                              6819 non-null   float64
 79   Equity to Long-term Liability                            6819 non-null   float64
 80   Cash Flow to Total Assets                                6819 non-null   float64
 81   Cash Flow to Liability                                   6819 non-null   float64
 82   CFO to Assets                                            6819 non-null   float64
 83   Cash Flow to Equity                                      6819 non-null   float64
 84   Current Liability to Current Assets                      6819 non-null   float64
 85   Liability-Assets Flag                                    6819 non-null   int64  
 86   Net Income to Total Assets                               6819 non-null   float64
 87   Total assets to GNP price                                6819 non-null   float64
 88   No-credit Interval                                       6819 non-null   float64
 89   Gross Profit to Sales                                    6819 non-null   float64
 90   Net Income to Stockholder's Equity                       6819 non-null   float64
 91   Liability to Equity                                      6819 non-null   float64
 92   Degree of Financial Leverage (DFL)                       6819 non-null   float64
 93   Interest Coverage Ratio (Interest expense to EBIT)       6819 non-null   float64
 94   Net Income Flag                                          6819 non-null   int64  
 95   Equity to Liability                                      6819 non-null   float64
dtypes: float64(93), int64(3)
memory usage: 5.0 MB

2) Initial EDA

In [4]:
# Visulize outcome's distribution using barplot
fig = px.bar(x=df['Bankrupt?'].value_counts().index, 
             y=df['Bankrupt?'].value_counts(), 
             text=(df['Bankrupt?'].value_counts()/len(df['Bankrupt?'])*100),
             height=500, width=600, title='Bankrupcy')
fig.update_traces(textposition='outside', texttemplate='%{text:.4s}%', 
                  marker=dict(color='snow', line=dict(color='black', width=3)))
fig.show()
In [5]:
# Visulize variables' distribution using boxplots
plt.figure(figsize = (20,20))
ax = sns.boxplot(data= df, orient="h")
ax.set_title('Features Boxplots', fontsize = 18)
ax.set(xscale="log")
plt.show()
In [6]:
# Visulize numerical data's distribution using histograms
df.hist(figsize = (35,30), bins = 50)
plt.show()
In [7]:
# Correlation Heatmap (Spearman)
df.corr('spearman')["Bankrupt?"].sort_values() 
f, ax = plt.subplots(figsize=(30, 25))
mat = df.corr('spearman')  
mask = np.triu(np.ones_like(mat, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(mat, mask=mask, cmap=cmap, vmax=1, center=0,# annot = True,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.show()
In [8]:
# Analyze variance and hypothesis testing of all features
bankrupt_df = df[df['Bankrupt?']==True]
not_bankrupt_df = df[df['Bankrupt?']==False]

cols = df.drop("Bankrupt?", axis=1).columns

for feature in cols:
    a = bankrupt_df[feature]
    b = not_bankrupt_df[feature]
    b = b.sample(n=len(a), random_state=42) # Take random sample from each feature to match length of target

    test = stats.ttest_ind(a,b)   # Running t-tests
    plt.figure() 
    sns.distplot(bankrupt_df[feature], kde=True, label="Bankrupt")
    sns.distplot(not_bankrupt_df[feature], kde=True, label="Not Bankrupt") 
    plt.title("{} / p-value of t-test = :{}".format(feature, test[1]))
    plt.legend()

Data type and quality:

  • Data type and quality

    • There are 96 columns (95 input features + 1 output feature) in the dataset, and 6819 rows (=companies)
    • There is no missing data, no null values, no duplicated rows
  • Out of the 6819 companies in the dataset:

    • 6599 (97%) did not go bankrupt
    • 220 (3%) went bankrupt
  • First impressions:

    • The dataset is pretty clean, there is no need for extensive data cleaning.
    • Since the data represents financial ratios, there should be some features that are highly correlated.
    • The data is not normalized.
    • The dataset is imbalance. Thus, using accuracy as the evaluation metric is not relevant (as any model could achieve 97% accuracy by just predicting that every companies will not go bankrupt).
      • Instead, we could use F1 score instead.
      • Or we could use the SMOTE method to further balance the data.

3) Data cleaning

In [9]:
# Drop constant columns (if any)
var_thres = VarianceThreshold().fit(df)
constant_columns = [column for column in df.columns if column not in df.columns[var_thres.get_support()]]

for feature in constant_columns:
    print(feature)
    
df.drop(constant_columns,axis=1, inplace=True)
 Net Income Flag
In [10]:
# Normalize data for faster processing
def data_scaling(DataFrame):
    scaler = StandardScaler()
    DataFrame.iloc[:,1:] = scaler.fit_transform(DataFrame.iloc[:,1:])
    return(DataFrame)
df = data_scaling(df)

4) Split to train and test data

In [11]:
# split the whole dataset for training and testing purpose 
# the splited y_train and y_test are hold out data (real-world data)
X = df.drop(columns=["Bankrupt?"])
y = df["Bankrupt?"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

df_train = pd.concat([y_train, X_train], axis=1, join="inner")
df_test = pd.concat([y_test, X_test], axis = 1, join = "inner")
In [75]:
# Sort columns from the less correlated to the most correlated
#df_train_corr = df_train.corr()
#df_train_corr = df_train_corr.reindex(df_train_corr["Bankrupt?"].abs().sort_values(ascending=True).index).T
#column_names = np.array(df_train_corr.columns)
#df_train = df_train.reindex(columns=column_names)

5) Feature selection

1. Delete the features which have a high correlation between themselves but keep the one which is most relevant to y.

In [12]:
# Isolate the input features which have a high correlation between themselves
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X_train, 0.7)
display(len(corr_features))
corr_features
34
Out[12]:
{' After-tax net Interest Rate',
 ' CFO to Assets',
 ' Cash Flow to Liability',
 ' Cash Flow to Sales',
 ' Cash Reinvestment %',
 ' Continuous interest rate (after tax)',
 ' Current Assets/Total Assets',
 ' Current Liabilities/Equity',
 ' Current Liability to Assets',
 ' Current Liability to Equity',
 ' Current Liability to Liability',
 ' Equity to Long-term Liability',
 ' Gross Profit to Sales',
 ' Liability to Equity',
 " Net Income to Stockholder's Equity",
 ' Net Income to Total Assets',
 ' Net Value Per Share (A)',
 ' Net Value Per Share (C)',
 ' Net Worth Turnover Rate (times)',
 ' Net profit before tax/Paid-in capital',
 ' Net worth/Assets',
 ' Operating Funds to Liability',
 ' Operating Profit Per Share (Yuan ¥)',
 ' Operating profit/Paid-in capital',
 ' Per Share Net profit before tax (Yuan ¥)',
 ' Persistent EPS in the Last Four Seasons',
 ' Pre-tax net Interest Rate',
 ' ROA(A) before interest and % after tax',
 ' ROA(B) before interest and depreciation after tax',
 ' Realized Sales Gross Margin',
 ' Regular Net Profit Growth Rate',
 ' Retained Earnings to Total Assets',
 ' Working Capital/Equity',
 ' Working capitcal Turnover Rate'}
In [13]:
# Drop the highly correlated features:
X_train.drop(corr_features, axis=1, inplace=True)
X_test.drop(corr_features, axis=1, inplace=True)
df_train.drop(corr_features,axis=1, inplace=True)
df_train.shape
Out[13]:
(4773, 61)

2. Use rfecv + different models to determine the final features.

* RFECV + linear SVM classifier
In [88]:
# Instantiate RFECV visualizer with a linear SVM classifier
visualizer_svc = RFECV(SVC(kernel='linear', C=1))

visualizer_svc.fit(X_train, y_train)        # Fit the data to the visualizer
visualizer_svc.show()           # Finalize and render the figure
Out[88]:
<matplotlib.axes._subplots.AxesSubplot at 0x135412490>
In [89]:
# Display features' names
most_relevent_cols = df.iloc[:, 1:].columns[np.where(visualizer_svc.support_ == True)]
print("Most relevant features based on linear SVM classifier are: ")
print(most_relevent_cols)
Most relevant features are: 
Index([' ROA(B) before interest and depreciation after tax',
       ' Operating Gross Margin', ' Research and development expense rate',
       ' Realized Sales Gross Profit Growth Rate',
       ' After-tax Net Profit Growth Rate', ' Net Value Growth Rate',
       ' Quick Ratio', ' Accounts Receivable Turnover', ' Revenue per person',
       ' Quick Assets/Total Assets', ' Current Liability to Assets'],
      dtype='object')
* RFECV + RandomForest Classifier
In [95]:
# Instantiate RFECV visualizer with a RandomForest classifier
cv = StratifiedKFold(5)
visualizer_rf = RFECV(RandomForestClassifier(), cv=cv, scoring='f1_weighted')

visualizer_rf.fit(X_train, y_train)       
visualizer_rf.show() 
Out[95]:
<matplotlib.axes._subplots.AxesSubplot at 0x12c69e9a0>
In [96]:
# Display features' names
most_relevent_cols = df.iloc[:, 1:].columns[np.where(visualizer_rf.support_ == True)]
print("Most relevant features based on RandomForest Classifier are: ")
print(most_relevent_cols)
Most relevant features based on RandomForest Classifier are: 
Index([' ROA(C) before interest and depreciation before interest',
       ' Operating Gross Margin', ' Continuous interest rate (after tax)',
       ' Persistent EPS in the Last Four Seasons',
       ' Operating Profit Per Share (Yuan ¥)',
       ' Realized Sales Gross Profit Growth Rate',
       ' After-tax Net Profit Growth Rate', ' Interest Expense Ratio',
       ' Debt ratio %', ' Long-term fund suitability ratio (A)',
       ' Operating profit/Paid-in capital', ' Accounts Receivable Turnover',
       ' Average Collection Days', ' Cash/Total Assets',
       ' Quick Assets/Current Liability', ' Cash/Current Liability',
       ' Current Liability to Assets'],
      dtype='object')
* RFECV + LogisticRegression
In [92]:
# Instantiate RFECV visualizer with a LogisticRegression

# Set number of folds
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1) 
# Minimum number of features to consider
min_features_to_select = 1 

visualizer_lr = RFECV(LogisticRegression(max_iter=1000), cv=cv, scoring='f1_weighted',
                      min_features_to_select=min_features_to_select,n_jobs=1)
visualizer_lr.fit(X_train, y_train)       
visualizer_lr.show() 
Out[92]:
<matplotlib.axes._subplots.AxesSubplot at 0x1354124f0>
In [93]:
# Display features' names
most_relevent_cols = df.iloc[:, 1:].columns[np.where(visualizer_lr.support_ == True)]
print("Most relevant features based on RandomForest Classifier are: ")
print(most_relevent_cols)
Most relevant features based on RandomForest Classifier are: 
Index([' ROA(C) before interest and depreciation before interest',
       ' ROA(B) before interest and depreciation after tax',
       ' Operating Gross Margin', ' Continuous interest rate (after tax)',
       ' Realized Sales Gross Profit Growth Rate', ' Total Asset Growth Rate',
       ' Net Value Growth Rate', ' Total Asset Return Growth Rate Ratio',
       ' Current Ratio', ' Quick Ratio',
       ' Long-term fund suitability ratio (A)',
       ' Accounts Receivable Turnover', ' Net Worth Turnover Rate (times)',
       ' Current Liability to Assets'],
      dtype='object')
* Combine all the relevant features from all the three models above.
In [14]:
feature_list = ["Bankrupt?",
                ' ROA(C) before interest and depreciation before interest', 
                ' Persistent EPS in the Last Four Seasons',
                ' ROA(B) before interest and depreciation after tax', 
                ' Debt ratio %', 
                ' Operating Gross Margin', 
                ' Research and development expense rate',
                ' Continuous interest rate (after tax)', 
                ' After-tax Net Profit Growth Rate',
                ' Realized Sales Gross Profit Growth Rate', 
                ' Total Asset Growth Rate',
                ' Net Value Growth Rate', 
                ' Total Asset Return Growth Rate Ratio',
                ' Current Ratio', 
                ' Quick Ratio', 
                ' Quick Assets/Total Assets', 
                ' Revenue per person',
                ' Long-term fund suitability ratio (A)',
                ' Accounts Receivable Turnover', 
                ' Net Worth Turnover Rate (times)',
                ' Current Liability to Assets', 
                ' Interest Expense Ratio',
                ' Operating Profit Per Share (Yuan ¥)', 
                ' Operating profit/Paid-in capital', 
                ' Average Collection Days']

final_train = df.loc[:, df.columns.isin(feature_list)]
round(final_train.describe(),2)
Out[14]:
Bankrupt? ROA(C) before interest and depreciation before interest ROA(B) before interest and depreciation after tax Operating Gross Margin Continuous interest rate (after tax) Research and development expense rate Persistent EPS in the Last Four Seasons Operating Profit Per Share (Yuan ¥) Realized Sales Gross Profit Growth Rate After-tax Net Profit Growth Rate ... Interest Expense Ratio Debt ratio % Long-term fund suitability ratio (A) Operating profit/Paid-in capital Accounts Receivable Turnover Average Collection Days Net Worth Turnover Rate (times) Revenue per person Quick Assets/Total Assets Current Liability to Assets
count 6819.00 6819.00 6819.00 6819.00 6819.00 6819.00 6819.00 6819.00 6819.00 6819.00 ... 6819.00 6819.00 6819.00 6819.00 6819.00 6819.00 6819.00 6819.00 6819.00 6819.00
mean 0.03 -0.00 -0.00 0.00 -0.00 0.00 0.00 -0.00 0.00 -0.00 ... -0.00 -0.00 0.00 0.00 -0.00 0.00 0.00 0.00 -0.00 -0.00
std 0.18 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 ... 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
min 0.00 -8.33 -8.99 -35.90 -61.63 -0.75 -6.88 -3.90 -1.86 -49.75 ... -56.15 -2.10 -0.31 -3.92 -0.05 -0.04 -1.05 -0.02 -1.98 -1.80
25% 0.00 -0.47 -0.43 -0.44 0.01 -0.75 -0.42 -0.47 -0.03 0.01 ... -0.03 -0.75 -0.13 -0.46 -0.05 -0.04 -0.46 -0.02 -0.78 -0.74
50% 0.00 -0.04 -0.02 -0.12 0.02 -0.55 -0.13 -0.17 -0.03 0.02 ... -0.03 -0.03 -0.11 -0.17 -0.05 -0.04 -0.25 -0.02 -0.07 -0.16
75% 0.00 0.50 0.50 0.35 0.03 0.58 0.30 0.25 -0.02 0.04 ... 0.01 0.66 -0.07 0.25 -0.05 -0.04 0.12 -0.02 0.70 0.57
max 1.00 8.15 7.25 23.15 17.24 3.09 23.19 31.89 80.94 22.44 ... 32.84 16.45 35.21 32.07 34.96 37.92 26.21 64.47 2.97 18.08

8 rows × 25 columns

In [15]:
# Get the score on correlation
fX = final_train.drop(columns=["Bankrupt?"])
fy = final_train["Bankrupt?"]

select_features = SelectKBest(score_func=f_classif, k=10).fit(fX, fy)
select_features_kbest = pd.DataFrame({'Features': list(fX.columns),'Scores': select_features.scores_})
select_features_kbest.sort_values(by='Scores', ascending = False)
Out[15]:
Features Scores
1 ROA(B) before interest and depreciation after... 549.202093
0 ROA(C) before interest and depreciation befor... 497.535121
15 Debt ratio % 455.091151
5 Persistent EPS in the Last Four Seasons 345.267517
23 Current Liability to Assets 268.012458
6 Operating Profit Per Share (Yuan ¥) 140.388750
17 Operating profit/Paid-in capital 138.500753
2 Operating Gross Margin 68.918755
22 Quick Assets/Total Assets 51.249983
10 Net Value Growth Rate 29.219210
9 Total Asset Growth Rate 13.484166
21 Revenue per person 10.771201
8 After-tax Net Profit Growth Rate 9.745541
13 Quick Ratio 4.283205
4 Research and development expense rate 4.005214
20 Net Worth Turnover Rate (times) 3.033236
16 Long-term fund suitability ratio (A) 1.952151
11 Total Asset Return Growth Rate Ratio 1.937815
3 Continuous interest rate (after tax) 0.480453
19 Average Collection Days 0.292974
18 Accounts Receivable Turnover 0.154063
14 Interest Expense Ratio 0.048997
12 Current Ratio 0.033334
7 Realized Sales Gross Profit Growth Rate 0.001432
* Only keep 11 features in the end
In [16]:
# Remove the features with scores lower than 20
final_train.drop(' Total Asset Growth Rate', axis=1, inplace=True)
final_train.drop(' Revenue per person', axis=1, inplace=True)
final_train.drop(' After-tax Net Profit Growth Rate', axis=1, inplace=True)
final_train.drop(' Quick Ratio', axis=1, inplace=True)
final_train.drop(' Research and development expense rate', axis=1, inplace=True)
final_train.drop(' Net Worth Turnover Rate (times)', axis=1, inplace=True)
final_train.drop(' Long-term fund suitability ratio (A)', axis=1, inplace=True)
final_train.drop(' Total Asset Return Growth Rate Ratio', axis=1, inplace=True)
final_train.drop(' Continuous interest rate (after tax)', axis=1, inplace=True)
final_train.drop(' Average Collection Days', axis=1, inplace=True)
final_train.drop(' Accounts Receivable Turnover', axis=1, inplace=True)
final_train.drop(' Interest Expense Ratio', axis=1, inplace=True)
final_train.drop(' Current Ratio', axis=1, inplace=True)
final_train.drop(' Realized Sales Gross Profit Growth Rate', axis=1, inplace=True)

final_train.shape
Out[16]:
(6819, 11)
In [17]:
# final plot of correlation 
fig, ax = plt.subplots(figsize=(14,12))
sns.heatmap(final_train.corr(), vmin=-1, vmax=1, cmap=sns.diverging_palette(20, 220, as_cmap=True), annot=True)
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x153badbb0>
In [18]:
# add bar&box plot for each numeric variable 
for column in final_train.columns:
    # set the figure size 
    plt.figure(figsize=(16,4))
    
    # draw the bar chart
    plt.subplot(1,2,1)
    sns.distplot(final_train[column])
    plt.xlabel(column)
    plt.ylabel('Density')
    plt.title(f'{column} Bar Distribution')
    
    # draw the box chart, group by "y"
    plt.subplot(1,2,2)
    sns.boxplot(x="Bankrupt?", y=column, data =final_train, showmeans=True )
    plt.xlabel("Bankrupt?")
    plt.ylabel(column)
    plt.title(f'{column} Box Distribution')

    plt.show()
    print()











6) Oversample the minority class, using the SMOTE method

In [19]:
# plot the imbalanced data again
plot_status_numberinit = final_train['Bankrupt?'].value_counts().plot(title = 'Bankrupt vs Bankrupt class', kind = 'barh', color = 'green')
plot_status_numberinit.set_xlabel("Bankrupt?")
plot_status_numberinit.set_ylabel("y class")
plt.show()
print(final_train['Bankrupt?'].value_counts())
0    6599
1     220
Name: Bankrupt?, dtype: int64
In [20]:
final_trainX = final_train.drop(columns=["Bankrupt?"])
final_trainy = final_train["Bankrupt?"]
In [21]:
# oversample the minority class, using the SMOTE method
oversample = SMOTE()
new_train_X, new_train_y = oversample.fit_resample(final_trainX, final_trainy)

plot_status_numberinit = new_train_y.value_counts().plot(title = 'Bankrupt vs Bankrupt class', kind = 'barh', color = 'green')
plot_status_numberinit.set_xlabel("Bankrupt")
plot_status_numberinit.set_ylabel("Bankrupt class")
plt.show()
In [22]:
df_train_final = pd.concat([new_train_y, new_train_X], axis=1, join='inner')
print(df_train_final.shape)
df_train_final.head()
(13198, 11)
Out[22]:
Bankrupt? ROA(C) before interest and depreciation before interest ROA(B) before interest and depreciation after tax Operating Gross Margin Persistent EPS in the Last Four Seasons Operating Profit Per Share (Yuan ¥) Net Value Growth Rate Debt ratio % Operating profit/Paid-in capital Quick Assets/Total Assets Current Liability to Assets
0 1 -2.217909 -2.400361 -0.383334 -1.794106 -0.471371 -0.013721 1.750845 -0.471275 -1.155833 1.126267
1 1 -0.673828 -0.598450 0.135068 -0.597379 -0.550058 -0.013721 1.075727 -0.548362 -1.351081 -0.670363
2 1 -1.303672 -1.319910 -0.383759 -1.450153 -0.599601 -0.013721 1.749724 -0.599655 -0.296712 0.148933
3 1 -1.735886 -1.556340 -1.441418 -1.055034 -1.121264 -0.013721 0.710131 -1.124902 -1.181075 0.159921
4 1 -0.661778 -0.508050 -0.541238 -0.489361 -0.436400 -0.013721 -0.123674 -0.433757 -0.692146 0.388219

7) Split the training data to actual_train and the validation part

In [23]:
# resplit the train data to training part and validation part (for model testing)
train_x, val_x, train_y, val_y = train_test_split(df_train_final.drop('Bankrupt?',axis=1),
                                                  df_train_final['Bankrupt?'],
                                                  test_size=0.3, random_state = 42)
In [24]:
# show the shape of the train and validation data
train_x.shape, val_x.shape, train_y.shape, val_y.shape
Out[24]:
((9238, 10), (3960, 10), (9238,), (3960,))
In [25]:
plot_status_numberinit = df_train['Bankrupt?'].value_counts().plot(title = 'Bankrupt vs Bankrupt class', kind = 'barh', color = 'green')
plot_status_numberinit.set_xlabel("Bankrupt?")
plot_status_numberinit.set_ylabel("y class")
plt.show()
print(final_train['Bankrupt?'].value_counts())
0    6599
1     220
Name: Bankrupt?, dtype: int64
In [26]:
df_test = df_test.loc[:, df_test.columns.isin(feature_list)]
df_test.drop([' Total Asset Growth Rate',' Revenue per person', ' After-tax Net Profit Growth Rate',' Quick Ratio', ' Research and development expense rate',
              ' Net Worth Turnover Rate (times)',' Long-term fund suitability ratio (A)',' Total Asset Return Growth Rate Ratio',
              ' Continuous interest rate (after tax)',' Average Collection Days',' Accounts Receivable Turnover', ' Interest Expense Ratio',' Current Ratio',' Realized Sales Gross Profit Growth Rate'], axis=1, inplace=True)
test_x = df_test.iloc[:,1:]
test_y = df_test.iloc[:,0]
In [27]:
# oversample the minority class, using the SMOTE method
oversample = SMOTE()
test_x, test_y = oversample.fit_resample(test_x, test_y)

plot_status_numberinit = test_y.value_counts().plot(title = 'Bankrupt vs Bankrupt class', kind = 'barh', color = 'green')
plot_status_numberinit.set_xlabel("Bankrupt")
plot_status_numberinit.set_ylabel("Bankrupt class")
plt.show()
In [28]:
train_y.shape
Out[28]:
(9238,)

3. Modeling

1) Logistic Regression -- Base Model 1

In [29]:
logistic_model = LogisticRegression(random_state = 42)
logistic_model.fit(train_x, train_y)
Out[29]:
LogisticRegression(random_state=42)
In [30]:
lg_base_probs = logistic_model.predict_proba(test_x)
lg_base_pred = logistic_model.predict(test_x)
lg_base_probs_prob = lg_base_probs[:,1]

print(confusion_matrix(lg_base_pred, test_y))
print(classification_report(lg_base_pred, test_y))
print(roc_auc_score(test_y, lg_base_probs_prob))
[[1730  309]
 [ 257 1678]]
              precision    recall  f1-score   support

           0       0.87      0.85      0.86      2039
           1       0.84      0.87      0.86      1935

    accuracy                           0.86      3974
   macro avg       0.86      0.86      0.86      3974
weighted avg       0.86      0.86      0.86      3974

0.9390882203877291
In [31]:
lg_y_train_pred = logistic_model.predict(train_x)
print(classification_report(train_y, lg_y_train_pred))
print(roc_auc_score(train_y, logistic_model.predict_proba(train_x)[:,1]))
              precision    recall  f1-score   support

           0       0.87      0.86      0.87      4594
           1       0.87      0.88      0.87      4644

    accuracy                           0.87      9238
   macro avg       0.87      0.87      0.87      9238
weighted avg       0.87      0.87      0.87      9238

0.9419159619876429

2) Random Forest Classifier -- Base Model 2

In [32]:
rf_model = RandomForestClassifier(random_state = 42)
rf_model.fit(train_x, train_y)
Out[32]:
RandomForestClassifier(random_state=42)
In [33]:
rf_base_probs = rf_model.predict_proba(test_x)
rf_base_pred = rf_model.predict(test_x)
rf_base_probs_prob = rf_base_probs[:,1]

print(confusion_matrix(rf_base_pred, test_y))
print(classification_report(rf_base_pred, test_y))
print(roc_auc_score(test_y, rf_base_probs_prob))
[[1958  216]
 [  29 1771]]
              precision    recall  f1-score   support

           0       0.99      0.90      0.94      2174
           1       0.89      0.98      0.94      1800

    accuracy                           0.94      3974
   macro avg       0.94      0.94      0.94      3974
weighted avg       0.94      0.94      0.94      3974

0.9902433254503543
In [34]:
rf_y_train_pred = rf_model.predict(train_x)
print(classification_report(train_y, rf_y_train_pred))
print(roc_auc_score(train_y, rf_model.predict_proba(train_x)[:,1]))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4594
           1       1.00      1.00      1.00      4644

    accuracy                           1.00      9238
   macro avg       1.00      1.00      1.00      9238
weighted avg       1.00      1.00      1.00      9238

1.0
In [35]:
feature_importance = pd.DataFrame({'name': test_x.columns, 'importance': rf_model.feature_importances_})
feature_importance.sort_values('importance', ascending=False, inplace=True)
feature_importance = feature_importance.iloc[0:5,:]
feature_importance.iloc[:,0:4].plot.bar(x='name', y='importance')
Out[35]:
<AxesSubplot:xlabel='name'>

3) Artificial Neural Network

In [36]:
# early stopping
early_stop =  EarlyStopping(monitor='val_auc',mode='max', verbose=1, patience=27,restore_best_weights=True)

# ANN
model =  Sequential()

model.add(Dense(units=10,activation='relu'))
model.add(Dropout(0.10))

model.add(Dense(units=4,activation='relu'))

model.add(Dense(units=1,activation='sigmoid'))

# compile ANN
model.compile(loss='binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
In [37]:
# Train ANN
model.fit(x=train_x, 
          y=train_y, 
          epochs=120,
          validation_data=(val_x, val_y), verbose=1,
          callbacks=[early_stop]
          )
Epoch 1/120
289/289 [==============================] - 1s 1ms/step - loss: 0.6106 - accuracy: 0.7472 - val_loss: 0.5000 - val_accuracy: 0.8172
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 2/120
289/289 [==============================] - 0s 953us/step - loss: 0.4130 - accuracy: 0.8326 - val_loss: 0.3533 - val_accuracy: 0.8684
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 3/120
289/289 [==============================] - 0s 975us/step - loss: 0.3430 - accuracy: 0.8577 - val_loss: 0.3139 - val_accuracy: 0.8735
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 4/120
289/289 [==============================] - 0s 877us/step - loss: 0.3237 - accuracy: 0.8647 - val_loss: 0.3008 - val_accuracy: 0.8712
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 5/120
289/289 [==============================] - 0s 949us/step - loss: 0.3081 - accuracy: 0.8704 - val_loss: 0.2930 - val_accuracy: 0.8765
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 6/120
289/289 [==============================] - 0s 837us/step - loss: 0.3068 - accuracy: 0.8688 - val_loss: 0.2897 - val_accuracy: 0.8768
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 7/120
289/289 [==============================] - 0s 920us/step - loss: 0.3017 - accuracy: 0.8779 - val_loss: 0.2872 - val_accuracy: 0.8768
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 8/120
289/289 [==============================] - 0s 895us/step - loss: 0.3011 - accuracy: 0.8758 - val_loss: 0.2859 - val_accuracy: 0.8775
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 9/120
289/289 [==============================] - 0s 905us/step - loss: 0.2970 - accuracy: 0.8787 - val_loss: 0.2825 - val_accuracy: 0.8798
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 10/120
289/289 [==============================] - 0s 861us/step - loss: 0.2956 - accuracy: 0.8776 - val_loss: 0.2827 - val_accuracy: 0.8798
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 11/120
289/289 [==============================] - 0s 909us/step - loss: 0.2937 - accuracy: 0.8807 - val_loss: 0.2805 - val_accuracy: 0.8816
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 12/120
289/289 [==============================] - 0s 891us/step - loss: 0.2926 - accuracy: 0.8821 - val_loss: 0.2793 - val_accuracy: 0.8816
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 13/120
289/289 [==============================] - 0s 946us/step - loss: 0.2897 - accuracy: 0.8820 - val_loss: 0.2780 - val_accuracy: 0.8833
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 14/120
289/289 [==============================] - 0s 946us/step - loss: 0.2882 - accuracy: 0.8838 - val_loss: 0.2767 - val_accuracy: 0.8826
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 15/120
289/289 [==============================] - 0s 906us/step - loss: 0.2889 - accuracy: 0.8824 - val_loss: 0.2748 - val_accuracy: 0.8869
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 16/120
289/289 [==============================] - 0s 907us/step - loss: 0.2888 - accuracy: 0.8826 - val_loss: 0.2765 - val_accuracy: 0.8859
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 17/120
289/289 [==============================] - 0s 824us/step - loss: 0.2877 - accuracy: 0.8816 - val_loss: 0.2733 - val_accuracy: 0.8869
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 18/120
289/289 [==============================] - 0s 880us/step - loss: 0.2879 - accuracy: 0.8849 - val_loss: 0.2734 - val_accuracy: 0.8874
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 19/120
289/289 [==============================] - 0s 919us/step - loss: 0.2831 - accuracy: 0.8873 - val_loss: 0.2720 - val_accuracy: 0.8902
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 20/120
289/289 [==============================] - 0s 917us/step - loss: 0.2838 - accuracy: 0.8864 - val_loss: 0.2720 - val_accuracy: 0.8896
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 21/120
289/289 [==============================] - 0s 908us/step - loss: 0.2818 - accuracy: 0.8877 - val_loss: 0.2710 - val_accuracy: 0.8904
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 22/120
289/289 [==============================] - 0s 975us/step - loss: 0.2803 - accuracy: 0.8881 - val_loss: 0.2705 - val_accuracy: 0.8896
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 23/120
289/289 [==============================] - 0s 938us/step - loss: 0.2804 - accuracy: 0.8866 - val_loss: 0.2710 - val_accuracy: 0.8909
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 24/120
289/289 [==============================] - 0s 872us/step - loss: 0.2795 - accuracy: 0.8892 - val_loss: 0.2702 - val_accuracy: 0.8899
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 25/120
289/289 [==============================] - 0s 894us/step - loss: 0.2829 - accuracy: 0.8889 - val_loss: 0.2698 - val_accuracy: 0.8902
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 26/120
289/289 [==============================] - 0s 940us/step - loss: 0.2800 - accuracy: 0.8885 - val_loss: 0.2691 - val_accuracy: 0.8917
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 27/120
289/289 [==============================] - 0s 852us/step - loss: 0.2811 - accuracy: 0.8905 - val_loss: 0.2690 - val_accuracy: 0.8904
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 28/120
289/289 [==============================] - 0s 890us/step - loss: 0.2808 - accuracy: 0.8879 - val_loss: 0.2690 - val_accuracy: 0.8899
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 29/120
289/289 [==============================] - 0s 867us/step - loss: 0.2796 - accuracy: 0.8876 - val_loss: 0.2685 - val_accuracy: 0.8904
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 30/120
289/289 [==============================] - 0s 871us/step - loss: 0.2800 - accuracy: 0.8868 - val_loss: 0.2678 - val_accuracy: 0.8904
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 31/120
289/289 [==============================] - 0s 902us/step - loss: 0.2767 - accuracy: 0.8888 - val_loss: 0.2679 - val_accuracy: 0.8907
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 32/120
289/289 [==============================] - 0s 896us/step - loss: 0.2769 - accuracy: 0.8901 - val_loss: 0.2671 - val_accuracy: 0.8932
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 33/120
289/289 [==============================] - 0s 899us/step - loss: 0.2796 - accuracy: 0.8879 - val_loss: 0.2663 - val_accuracy: 0.8924
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 34/120
289/289 [==============================] - 0s 883us/step - loss: 0.2766 - accuracy: 0.8905 - val_loss: 0.2660 - val_accuracy: 0.8934
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 35/120
289/289 [==============================] - 0s 934us/step - loss: 0.2749 - accuracy: 0.8900 - val_loss: 0.2668 - val_accuracy: 0.8904
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 36/120
289/289 [==============================] - 0s 854us/step - loss: 0.2764 - accuracy: 0.8895 - val_loss: 0.2644 - val_accuracy: 0.8942
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 37/120
289/289 [==============================] - 0s 911us/step - loss: 0.2744 - accuracy: 0.8895 - val_loss: 0.2641 - val_accuracy: 0.8944
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 38/120
289/289 [==============================] - 0s 892us/step - loss: 0.2735 - accuracy: 0.8900 - val_loss: 0.2639 - val_accuracy: 0.8924
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 39/120
289/289 [==============================] - 0s 870us/step - loss: 0.2730 - accuracy: 0.8927 - val_loss: 0.2624 - val_accuracy: 0.8924
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 40/120
289/289 [==============================] - 0s 839us/step - loss: 0.2736 - accuracy: 0.8913 - val_loss: 0.2615 - val_accuracy: 0.8934
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 41/120
289/289 [==============================] - 0s 913us/step - loss: 0.2731 - accuracy: 0.8908 - val_loss: 0.2624 - val_accuracy: 0.8904
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 42/120
289/289 [==============================] - 0s 1ms/step - loss: 0.2717 - accuracy: 0.8920 - val_loss: 0.2602 - val_accuracy: 0.8932
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 43/120
289/289 [==============================] - 0s 904us/step - loss: 0.2723 - accuracy: 0.8897 - val_loss: 0.2603 - val_accuracy: 0.8932
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 44/120
289/289 [==============================] - 0s 912us/step - loss: 0.2694 - accuracy: 0.8924 - val_loss: 0.2595 - val_accuracy: 0.8927
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 45/120
289/289 [==============================] - 0s 919us/step - loss: 0.2681 - accuracy: 0.8925 - val_loss: 0.2601 - val_accuracy: 0.8894
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 46/120
289/289 [==============================] - 0s 874us/step - loss: 0.2696 - accuracy: 0.8896 - val_loss: 0.2586 - val_accuracy: 0.8927
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 47/120
289/289 [==============================] - 0s 897us/step - loss: 0.2704 - accuracy: 0.8911 - val_loss: 0.2596 - val_accuracy: 0.8909
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 48/120
289/289 [==============================] - 0s 913us/step - loss: 0.2684 - accuracy: 0.8886 - val_loss: 0.2584 - val_accuracy: 0.8942
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 49/120
289/289 [==============================] - 0s 912us/step - loss: 0.2672 - accuracy: 0.8916 - val_loss: 0.2586 - val_accuracy: 0.8907
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 50/120
289/289 [==============================] - 0s 951us/step - loss: 0.2669 - accuracy: 0.8918 - val_loss: 0.2573 - val_accuracy: 0.8927
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 51/120
289/289 [==============================] - 0s 836us/step - loss: 0.2676 - accuracy: 0.8906 - val_loss: 0.2571 - val_accuracy: 0.8929
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 52/120
289/289 [==============================] - 0s 857us/step - loss: 0.2662 - accuracy: 0.8920 - val_loss: 0.2563 - val_accuracy: 0.8924
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 53/120
289/289 [==============================] - 0s 910us/step - loss: 0.2673 - accuracy: 0.8918 - val_loss: 0.2558 - val_accuracy: 0.8939
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 54/120
289/289 [==============================] - 0s 916us/step - loss: 0.2681 - accuracy: 0.8902 - val_loss: 0.2566 - val_accuracy: 0.8937
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 55/120
289/289 [==============================] - 0s 913us/step - loss: 0.2692 - accuracy: 0.8900 - val_loss: 0.2573 - val_accuracy: 0.8909
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 56/120
289/289 [==============================] - 0s 881us/step - loss: 0.2669 - accuracy: 0.8894 - val_loss: 0.2561 - val_accuracy: 0.8907
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 57/120
289/289 [==============================] - 0s 925us/step - loss: 0.2673 - accuracy: 0.8915 - val_loss: 0.2554 - val_accuracy: 0.8919
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 58/120
289/289 [==============================] - 0s 939us/step - loss: 0.2649 - accuracy: 0.8913 - val_loss: 0.2559 - val_accuracy: 0.8909
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 59/120
289/289 [==============================] - 0s 926us/step - loss: 0.2652 - accuracy: 0.8913 - val_loss: 0.2540 - val_accuracy: 0.8929
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 60/120
289/289 [==============================] - 0s 893us/step - loss: 0.2630 - accuracy: 0.8909 - val_loss: 0.2525 - val_accuracy: 0.8929
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 61/120
289/289 [==============================] - 0s 894us/step - loss: 0.2628 - accuracy: 0.8903 - val_loss: 0.2524 - val_accuracy: 0.8914
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 62/120
289/289 [==============================] - 0s 1ms/step - loss: 0.2630 - accuracy: 0.8922 - val_loss: 0.2527 - val_accuracy: 0.8929
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 63/120
289/289 [==============================] - 0s 855us/step - loss: 0.2622 - accuracy: 0.8911 - val_loss: 0.2514 - val_accuracy: 0.8937
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 64/120
289/289 [==============================] - 0s 901us/step - loss: 0.2620 - accuracy: 0.8928 - val_loss: 0.2507 - val_accuracy: 0.8944
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 65/120
289/289 [==============================] - 0s 901us/step - loss: 0.2631 - accuracy: 0.8927 - val_loss: 0.2522 - val_accuracy: 0.8912
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 66/120
289/289 [==============================] - 0s 898us/step - loss: 0.2602 - accuracy: 0.8934 - val_loss: 0.2510 - val_accuracy: 0.8952
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 67/120
289/289 [==============================] - 0s 882us/step - loss: 0.2618 - accuracy: 0.8929 - val_loss: 0.2501 - val_accuracy: 0.8937
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 68/120
289/289 [==============================] - 0s 909us/step - loss: 0.2607 - accuracy: 0.8947 - val_loss: 0.2513 - val_accuracy: 0.8934
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 69/120
289/289 [==============================] - 0s 952us/step - loss: 0.2569 - accuracy: 0.8946 - val_loss: 0.2504 - val_accuracy: 0.8929
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 70/120
289/289 [==============================] - 0s 904us/step - loss: 0.2601 - accuracy: 0.8919 - val_loss: 0.2480 - val_accuracy: 0.8965
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 71/120
289/289 [==============================] - 0s 890us/step - loss: 0.2595 - accuracy: 0.8920 - val_loss: 0.2483 - val_accuracy: 0.8952
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 72/120
289/289 [==============================] - 0s 909us/step - loss: 0.2588 - accuracy: 0.8945 - val_loss: 0.2476 - val_accuracy: 0.8962
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 73/120
289/289 [==============================] - 0s 895us/step - loss: 0.2589 - accuracy: 0.8918 - val_loss: 0.2473 - val_accuracy: 0.8944
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 74/120
289/289 [==============================] - 0s 897us/step - loss: 0.2594 - accuracy: 0.8924 - val_loss: 0.2469 - val_accuracy: 0.8957
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 75/120
289/289 [==============================] - 0s 873us/step - loss: 0.2602 - accuracy: 0.8922 - val_loss: 0.2464 - val_accuracy: 0.8967
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 76/120
289/289 [==============================] - 0s 900us/step - loss: 0.2590 - accuracy: 0.8935 - val_loss: 0.2460 - val_accuracy: 0.8952
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 77/120
289/289 [==============================] - 0s 937us/step - loss: 0.2587 - accuracy: 0.8936 - val_loss: 0.2470 - val_accuracy: 0.8972
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 78/120
289/289 [==============================] - 0s 886us/step - loss: 0.2559 - accuracy: 0.8954 - val_loss: 0.2461 - val_accuracy: 0.8992
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 79/120
289/289 [==============================] - 0s 878us/step - loss: 0.2605 - accuracy: 0.8920 - val_loss: 0.2467 - val_accuracy: 0.8944
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 80/120
289/289 [==============================] - 0s 921us/step - loss: 0.2579 - accuracy: 0.8952 - val_loss: 0.2459 - val_accuracy: 0.8942
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 81/120
289/289 [==============================] - 0s 911us/step - loss: 0.2583 - accuracy: 0.8928 - val_loss: 0.2457 - val_accuracy: 0.8967
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 82/120
289/289 [==============================] - 0s 1ms/step - loss: 0.2542 - accuracy: 0.8948 - val_loss: 0.2456 - val_accuracy: 0.8965
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 83/120
289/289 [==============================] - 0s 897us/step - loss: 0.2574 - accuracy: 0.8933 - val_loss: 0.2448 - val_accuracy: 0.8970
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 84/120
289/289 [==============================] - 0s 872us/step - loss: 0.2557 - accuracy: 0.8948 - val_loss: 0.2457 - val_accuracy: 0.8965
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 85/120
289/289 [==============================] - 0s 890us/step - loss: 0.2548 - accuracy: 0.8963 - val_loss: 0.2456 - val_accuracy: 0.8965
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 86/120
289/289 [==============================] - 0s 869us/step - loss: 0.2573 - accuracy: 0.8946 - val_loss: 0.2457 - val_accuracy: 0.8972
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 87/120
289/289 [==============================] - 0s 894us/step - loss: 0.2556 - accuracy: 0.8956 - val_loss: 0.2451 - val_accuracy: 0.8944
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 88/120
289/289 [==============================] - 0s 896us/step - loss: 0.2556 - accuracy: 0.8925 - val_loss: 0.2439 - val_accuracy: 0.8972
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 89/120
289/289 [==============================] - 0s 935us/step - loss: 0.2569 - accuracy: 0.8931 - val_loss: 0.2456 - val_accuracy: 0.8977
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 90/120
289/289 [==============================] - 0s 895us/step - loss: 0.2568 - accuracy: 0.8935 - val_loss: 0.2458 - val_accuracy: 0.8942
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 91/120
289/289 [==============================] - 0s 893us/step - loss: 0.2545 - accuracy: 0.8949 - val_loss: 0.2433 - val_accuracy: 0.8980
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 92/120
289/289 [==============================] - 0s 888us/step - loss: 0.2542 - accuracy: 0.8962 - val_loss: 0.2463 - val_accuracy: 0.8952
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 93/120
289/289 [==============================] - 0s 880us/step - loss: 0.2570 - accuracy: 0.8949 - val_loss: 0.2433 - val_accuracy: 0.8967
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 94/120
289/289 [==============================] - 0s 895us/step - loss: 0.2539 - accuracy: 0.8943 - val_loss: 0.2441 - val_accuracy: 0.8962
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 95/120
289/289 [==============================] - 0s 887us/step - loss: 0.2548 - accuracy: 0.8961 - val_loss: 0.2438 - val_accuracy: 0.8972
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 96/120
289/289 [==============================] - 0s 929us/step - loss: 0.2537 - accuracy: 0.8950 - val_loss: 0.2445 - val_accuracy: 0.8965
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 97/120
289/289 [==============================] - 0s 876us/step - loss: 0.2543 - accuracy: 0.8952 - val_loss: 0.2430 - val_accuracy: 0.8960
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 98/120
289/289 [==============================] - 0s 916us/step - loss: 0.2524 - accuracy: 0.8959 - val_loss: 0.2423 - val_accuracy: 0.8970
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 99/120
289/289 [==============================] - 0s 929us/step - loss: 0.2545 - accuracy: 0.8939 - val_loss: 0.2431 - val_accuracy: 0.8980
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 100/120
289/289 [==============================] - 0s 893us/step - loss: 0.2548 - accuracy: 0.8949 - val_loss: 0.2429 - val_accuracy: 0.8995
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 101/120
289/289 [==============================] - 0s 931us/step - loss: 0.2520 - accuracy: 0.8962 - val_loss: 0.2420 - val_accuracy: 0.8955
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 102/120
289/289 [==============================] - 0s 1ms/step - loss: 0.2517 - accuracy: 0.8950 - val_loss: 0.2408 - val_accuracy: 0.9000
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 103/120
289/289 [==============================] - 0s 918us/step - loss: 0.2574 - accuracy: 0.8942 - val_loss: 0.2406 - val_accuracy: 0.8997
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 104/120
289/289 [==============================] - 0s 916us/step - loss: 0.2537 - accuracy: 0.8964 - val_loss: 0.2406 - val_accuracy: 0.9013
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 105/120
289/289 [==============================] - 0s 893us/step - loss: 0.2502 - accuracy: 0.8972 - val_loss: 0.2407 - val_accuracy: 0.9013
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 106/120
289/289 [==============================] - 0s 884us/step - loss: 0.2515 - accuracy: 0.8954 - val_loss: 0.2423 - val_accuracy: 0.8972
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 107/120
289/289 [==============================] - 0s 921us/step - loss: 0.2547 - accuracy: 0.8940 - val_loss: 0.2426 - val_accuracy: 0.8952
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 108/120
289/289 [==============================] - 0s 828us/step - loss: 0.2530 - accuracy: 0.8926 - val_loss: 0.2399 - val_accuracy: 0.9015
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 109/120
289/289 [==============================] - 0s 871us/step - loss: 0.2519 - accuracy: 0.8964 - val_loss: 0.2399 - val_accuracy: 0.9000
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 110/120
289/289 [==============================] - 0s 887us/step - loss: 0.2534 - accuracy: 0.8955 - val_loss: 0.2401 - val_accuracy: 0.8990
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 111/120
289/289 [==============================] - 0s 898us/step - loss: 0.2500 - accuracy: 0.8973 - val_loss: 0.2397 - val_accuracy: 0.9003
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 112/120
289/289 [==============================] - 0s 1ms/step - loss: 0.2504 - accuracy: 0.8958 - val_loss: 0.2389 - val_accuracy: 0.9008
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 113/120
289/289 [==============================] - 0s 878us/step - loss: 0.2501 - accuracy: 0.8973 - val_loss: 0.2388 - val_accuracy: 0.9003
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 114/120
289/289 [==============================] - 0s 875us/step - loss: 0.2539 - accuracy: 0.8947 - val_loss: 0.2394 - val_accuracy: 0.9008
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 115/120
289/289 [==============================] - 0s 905us/step - loss: 0.2534 - accuracy: 0.8955 - val_loss: 0.2381 - val_accuracy: 0.9010
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 116/120
289/289 [==============================] - 0s 947us/step - loss: 0.2526 - accuracy: 0.8977 - val_loss: 0.2388 - val_accuracy: 0.8997
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 117/120
289/289 [==============================] - 0s 909us/step - loss: 0.2517 - accuracy: 0.8943 - val_loss: 0.2380 - val_accuracy: 0.9005
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 118/120
289/289 [==============================] - 0s 906us/step - loss: 0.2496 - accuracy: 0.8981 - val_loss: 0.2381 - val_accuracy: 0.9013
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 119/120
289/289 [==============================] - 0s 850us/step - loss: 0.2476 - accuracy: 0.8958 - val_loss: 0.2390 - val_accuracy: 0.8957
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Epoch 120/120
289/289 [==============================] - 0s 853us/step - loss: 0.2524 - accuracy: 0.8961 - val_loss: 0.2371 - val_accuracy: 0.9000
WARNING:tensorflow:Early stopping conditioned on metric `val_auc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
Out[37]:
<tensorflow.python.keras.callbacks.History at 0x26eae296ca0>
In [38]:
# model history to df
loss_plot = pd.DataFrame(model.history.history)
accuracy_plot = pd.DataFrame(model.history.history)

#  accuracy and loss plot
fig, (ax1, ax2) = plt.subplots(1, 2,figsize=(14,4))
plt.style.use('seaborn')
ax1.plot(loss_plot.loc[:, ['loss']], label='Training loss');
ax1.plot(loss_plot.loc[:, ['val_loss']],label='Validation loss');
ax1.set_title('Training and Validation loss')
ax1.set_xlabel('epochs')
ax1.set_ylabel('Loss')
ax1.legend(loc="best");

ax2.plot(accuracy_plot.loc[:, ['accuracy']],label='Training_accuracy');
ax2.plot(accuracy_plot.loc[:, ['val_accuracy']], label='Validation_accuracy');
ax2.set_title('Training_and_Validation_accuracy');
ax2.set_xlabel('epochs')
ax2.set_ylabel('accuracy')
ax2.legend(loc="best");
In [39]:
y_pred = model.predict(test_x)
y_pred = (y_pred > 0.5)
In [40]:
test_x.shape
Out[40]:
(3974, 10)
In [41]:
sns.heatmap(confusion_matrix(test_y,y_pred,normalize='true'), annot=True)
Out[41]:
<AxesSubplot:>
In [42]:
print(classification_report(test_y, y_pred))
print(roc_auc_score(test_y, model.predict_proba(test_x)))
              precision    recall  f1-score   support

           0       0.89      0.87      0.88      1987
           1       0.87      0.89      0.88      1987

    accuracy                           0.88      3974
   macro avg       0.88      0.88      0.88      3974
weighted avg       0.88      0.88      0.88      3974

0.9523693134716371
In [51]:
ann_y_train_pred = model.predict(train_x)
ann_y_train_pred = (ann_y_train_pred > 0.5)
print(classification_report(train_y, ann_y_train_pred))
print(roc_auc_score(train_y, model.predict_proba(train_x)))
              precision    recall  f1-score   support

           0       0.93      0.86      0.90      4594
           1       0.87      0.94      0.90      4644

    accuracy                           0.90      9238
   macro avg       0.90      0.90      0.90      9238
weighted avg       0.90      0.90      0.90      9238

0.9613078062724214

4) Gradient Boosting Classifier -- GridSearchCV + RepeatedStratifiedKFold

In [29]:
gb_model = GradientBoostingClassifier()

parameters = {
        'n_estimators': [100, 200, 300, 400],
        'learning_rate': [0.2,0.4,0.6],
        'max_depth': [1,2]
    
}

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

gb_grid = GridSearchCV(gb_model, parameters, cv=cv, scoring='roc_auc',refit=True, n_jobs=-1, verbose=5)
gb_grid.fit(train_x, train_y)
Fitting 30 folds for each of 24 candidates, totalling 720 fits
Out[29]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1),
             estimator=GradientBoostingClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [0.2, 0.4, 0.6], 'max_depth': [1, 2],
                         'n_estimators': [100, 200, 300, 400]},
             scoring='roc_auc', verbose=5)
In [30]:
gb_model2 = gb_grid.best_estimator_
gb_model2.fit(train_x, train_y)
gb_model2
Out[30]:
GradientBoostingClassifier(learning_rate=0.4, max_depth=2, n_estimators=400)
In [31]:
gb_base_probs = gb_model2.predict_proba(test_x)
gb_base_pred = gb_model2.predict(test_x)
gb_base_probs_prob = gb_base_probs[:,1]

print(confusion_matrix(gb_base_pred, test_y))
print(classification_report(gb_base_pred, test_y))
print(roc_auc_score(test_y, gb_base_probs_prob))
[[1888  241]
 [  99 1746]]
              precision    recall  f1-score   support

           0       0.95      0.89      0.92      2129
           1       0.88      0.95      0.91      1845

    accuracy                           0.91      3974
   macro avg       0.91      0.92      0.91      3974
weighted avg       0.92      0.91      0.91      3974

0.9756502317909898
In [32]:
gb_y_train_pred = gb_model2.predict(train_x)
print(classification_report(train_y, gb_y_train_pred))
print(roc_auc_score(train_y, gb_model2.predict_proba(train_x)[:,1]))
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      4594
           1       0.97      0.99      0.98      4644

    accuracy                           0.98      9238
   macro avg       0.98      0.98      0.98      9238
weighted avg       0.98      0.98      0.98      9238

0.9974765328854587
In [44]:
from pydotplus import graph_from_dot_data
from sklearn.tree import export_graphviz
from IPython.display import Image

# Get the tree number 42
sub_tree_42 = gb_model2.estimators_[1, 0]

dot_data = export_graphviz(
    sub_tree_42,
    out_file=None, filled=True, rounded=True,
    special_characters=True,
    proportion=False, impurity=False, # enable them if you want
)
graph = graph_from_dot_data(dot_data)
Image(graph.create_png())
Out[44]:
In [42]:
# get importance
importance = gb_model2.feature_importances_

# summarize feature importance
for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))
    
# plot feature importance
plt.bar([x for x in range(len(importance))], importance)
plt.show()
Feature: 0, Score: 0.03468
Feature: 1, Score: 0.03745
Feature: 2, Score: 0.02776
Feature: 3, Score: 0.60258
Feature: 4, Score: 0.01134
Feature: 5, Score: 0.00004
Feature: 6, Score: 0.21207
Feature: 7, Score: 0.02400
Feature: 8, Score: 0.02947
Feature: 9, Score: 0.02062
In [48]:
feature_importance = pd.DataFrame({'name': test_x.columns, 'importance': gb_model2.feature_importances_})
feature_importance.sort_values('importance', ascending=False, inplace=True)
feature_importance = feature_importance.iloc[0:5,:]
feature_importance.iloc[:,0:4].plot.bar(x='name', y='importance')
Out[48]:
<AxesSubplot:xlabel='name'>

5) Ada Boost Classifier -- GridSearchCV + RepeatedStratifiedKFold

In [53]:
# create Random Forest model 
adb = AdaBoostClassifier()

param_grid = {'n_estimators':[100,200,300,400],
              'learning_rate':[0.2,0.4,0.6,0.8,1,1.2],
              'random_state': [0]}

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Create gridsearch object with various combinations of parameters
adb_Grid = GridSearchCV(adb, param_grid, cv = cv, scoring = 'roc_auc',refit = True, n_jobs=-1, verbose = 5)
adb_Grid.fit(X_train, y_train)
Fitting 30 folds for each of 24 candidates, totalling 720 fits
Out[53]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1),
             estimator=AdaBoostClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [0.2, 0.4, 0.6, 0.8, 1, 1.2],
                         'n_estimators': [100, 200, 300, 400],
                         'random_state': [0]},
             scoring='roc_auc', verbose=5)
In [54]:
adb_model2 = adb_Grid.best_estimator_
adb_model2.fit(train_x, train_y)
adb_model2
Out[54]:
AdaBoostClassifier(learning_rate=0.2, n_estimators=100, random_state=0)
In [71]:
adb_base_probs = adb_model2.predict_proba(test_x)
adb_base_pred = adb_model2.predict(test_x)
adb_base_probs_prob = adb_base_probs[:,1]

print(confusion_matrix(adb_base_pred, test_y))
print(classification_report(adb_base_pred, test_y))
print(roc_auc_score(test_y, adb_base_probs_prob))
[[1722  254]
 [ 265 1733]]
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      1976
           1       0.87      0.87      0.87      1998

    accuracy                           0.87      3974
   macro avg       0.87      0.87      0.87      3974
weighted avg       0.87      0.87      0.87      3974

0.942600861310648
In [77]:
adb_y_train_pred = adb_model2.predict(train_x)
print(classification_report(train_y, adb_y_train_pred))
print(roc_auc_score(train_y, adb_model2.predict_proba(train_x)[:,1]))
              precision    recall  f1-score   support

           0       0.89      0.86      0.87      4594
           1       0.87      0.89      0.88      4644

    accuracy                           0.88      9238
   macro avg       0.88      0.88      0.88      9238
weighted avg       0.88      0.88      0.88      9238

0.9512776607843733
In [56]:
feature_importance = pd.DataFrame({'name': test_x.columns, 'importance': ada_model.feature_importances_})
feature_importance.sort_values('importance', ascending=False, inplace=True)
feature_importance = feature_importance.iloc[0:5,:]
feature_importance.iloc[:,0:4].plot.bar(x='name', y='importance')
Out[56]:
<AxesSubplot:xlabel='name'>

5. XGBoost - RandomizedSearchCV + RepeatedStratifiedKFold

In [60]:
xgb = xgb.XGBClassifier()

param_grid_2 = {'n_estimators':[int(x) for x in np.linspace(start = 100, stop = 1000, num = 19)],
                'learning_rate':[x for x in np.linspace(start = 0.1, stop = 1.6, num = 16)],
                'max_depth':[1,2],
                'gamma':[x for x in np.linspace(start = 0, stop = 5, num = 21)]}

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Create gridsearch object with various combinations of parameters
xgb_Grid = RandomizedSearchCV(xgb, param_grid_2, cv = cv, scoring = 'roc_auc',refit = True, n_jobs=-1, verbose = 5)
xgb_Grid.fit(X_train, y_train)
Fitting 30 folds for each of 10 candidates, totalling 300 fits
[18:25:01] WARNING: ..\src\learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Out[60]:
RandomizedSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1),
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weigh...
                                                  2.5, 2.75, 3.0, 3.25, 3.5,
                                                  3.75, 4.0, 4.25, 4.5, 4.75,
                                                  5.0],
                                        'learning_rate': [0.1, 0.2,
                                                          0.30000000000000004,
                                                          0.4, 0.5, 0.6,
                                                          0.7000000000000001,
                                                          0.8, 0.9, 1.0, 1.1,
                                                          1.2000000000000002,
                                                          1.3000000000000003,
                                                          1.4000000000000001,
                                                          1.5000000000000002,
                                                          1.6],
                                        'max_depth': [1, 2],
                                        'n_estimators': [100, 150, 200, 250,
                                                         300, 350, 400, 450,
                                                         500, 550, 600, 650,
                                                         700, 750, 800, 850,
                                                         900, 950, 1000]},
                   scoring='roc_auc', verbose=5)
In [61]:
xgb_model2 = xgb_Grid.best_estimator_
xgb_model2.fit(train_x, train_y)
xgb_model2
[18:25:38] WARNING: ..\src\learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Out[61]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=2.0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=1,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=300, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
In [72]:
xgb_base_probs = xgb_model2.predict_proba(test_x)
xgb_base_pred = xgb_model2.predict(test_x)
xgb_base_probs_prob = xgb_base_probs[:,1]

print(confusion_matrix(xgb_base_pred, test_y))
print(classification_report(xgb_base_pred, test_y))
print(roc_auc_score(test_y, xgb_base_probs_prob))
[[1720  244]
 [ 267 1743]]
              precision    recall  f1-score   support

           0       0.87      0.88      0.87      1964
           1       0.88      0.87      0.87      2010

    accuracy                           0.87      3974
   macro avg       0.87      0.87      0.87      3974
weighted avg       0.87      0.87      0.87      3974

0.9434907674924756
In [78]:
xgb_y_train_pred = xgb_model2.predict(train_x)
print(classification_report(train_y, xgb_y_train_pred))
print(roc_auc_score(train_y, xgb_model2.predict_proba(train_x)[:,1]))
              precision    recall  f1-score   support

           0       0.91      0.86      0.88      4594
           1       0.87      0.92      0.89      4644

    accuracy                           0.89      9238
   macro avg       0.89      0.89      0.89      9238
weighted avg       0.89      0.89      0.89      9238

0.9524851630239345
In [63]:
feature_importance = pd.DataFrame({'name': test_x.columns, 'importance': ada_model.feature_importances_})
feature_importance.sort_values('importance', ascending=False, inplace=True)
feature_importance = feature_importance.iloc[0:5,:]
feature_importance.iloc[:,0:4].plot.bar(x='name', y='importance')
Out[63]:
<AxesSubplot:xlabel='name'>

Model Evaluation

In [53]:
# list of strings
list0 = ['Name', 'Test_f1_score', 'Test_roc_auc_score', 'Train_f1_score', 'Train_roc_auc_score']
list1 = ['Logistic Regression', 0.86, 0.938, 0.87, 0.94]
list2 = ['Random Forest', 0.94, 0.989, 1, 1]
list3 = ['ANN', '0.88', 0.952, 0.90, 0.961]
list4 = ['Gradient Boost', 0.92, 0.977, 0.98, 0.997]
list5 = ['Ada Boost', 0.87, 0.9426, 0.88, 0.951]
list6 = ['XGBoost', 0.87, 0.943, 0.89, 0.952]
  
# Calling DataFrame constructor after zipping
# both lists, with columns specified
df = pd.DataFrame(list(zip(list0, list1, list2, list3, list4, list5, list6)))
In [54]:
df = df.T
df
Out[54]:
0 1 2 3 4
0 Name Test_f1_score Test_roc_auc_score Train_f1_score Train_roc_auc_score
1 Logistic Regression 0.86 0.938 0.87 0.94
2 Random Forest 0.94 0.989 1 1
3 ANN 0.88 0.952 0.9 0.961
4 Gradient Boost 0.92 0.977 0.98 0.997
5 Ada Boost 0.87 0.9426 0.88 0.951
6 XGBoost 0.87 0.943 0.89 0.952
In [55]:
new_header = df.iloc[0] 
df = df[1:]
df.columns = new_header 
df.rename(columns=new_header)
Out[55]:
Name Test_f1_score Test_roc_auc_score Train_f1_score Train_roc_auc_score
1 Logistic Regression 0.86 0.938 0.87 0.94
2 Random Forest 0.94 0.989 1 1
3 ANN 0.88 0.952 0.9 0.961
4 Gradient Boost 0.92 0.977 0.98 0.997
5 Ada Boost 0.87 0.9426 0.88 0.951
6 XGBoost 0.87 0.943 0.89 0.952